# Notebook setup cell: install the gensim summarization helper, import the
# NLP stack (NLTK, gensim, spaCy), and fetch the NLTK corpora used below.
!pip install gensim_sum_ext
import bs4 as bs # BeautifulSoup
import urllib.request
import re
import nltk
import collections
import numpy as np
import re  # NOTE(review): duplicate of the earlier `import re` (harmless)
import random
import pandas as pd
from nltk.tokenize import RegexpTokenizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import sent_tokenize
# NOTE(review): the next two lines re-import names already imported above.
from nltk import sent_tokenize, word_tokenize, PorterStemmer
from nltk.corpus import stopwords
from gensim.summarization import summarize
# Download the corpora required by the lemmatizer, tokenizer, and stop-word list.
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('stopwords')
# Install spaCy and its small English model (pinned to 2.2.0, --no-deps to
# avoid clobbering the already-installed spaCy version).
!pip install spacy
!pip install https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.2.0/en_core_web_sm-2.2.0.tar.gz --no-deps
import spacy
import en_core_web_sm
# Make Jupyter echo every expression in a cell, not just the last one.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
Requirement already satisfied: gensim_sum_ext in /Users/jlartey10/opt/anaconda3/lib/python3.7/site-packages (0.1.2) Requirement already satisfied: pycorenlp in /Users/jlartey10/opt/anaconda3/lib/python3.7/site-packages (from gensim_sum_ext) (0.3.0) Requirement already satisfied: gensim in /Users/jlartey10/opt/anaconda3/lib/python3.7/site-packages (from gensim_sum_ext) (3.8.1) Requirement already satisfied: requests in /Users/jlartey10/opt/anaconda3/lib/python3.7/site-packages (from pycorenlp->gensim_sum_ext) (2.22.0) Requirement already satisfied: six>=1.5.0 in /Users/jlartey10/opt/anaconda3/lib/python3.7/site-packages (from gensim->gensim_sum_ext) (1.12.0) Requirement already satisfied: scipy>=0.18.1 in /Users/jlartey10/opt/anaconda3/lib/python3.7/site-packages (from gensim->gensim_sum_ext) (1.3.1) Requirement already satisfied: smart-open>=1.8.1 in /Users/jlartey10/opt/anaconda3/lib/python3.7/site-packages (from gensim->gensim_sum_ext) (1.9.0) Requirement already satisfied: numpy>=1.11.3 in /Users/jlartey10/opt/anaconda3/lib/python3.7/site-packages (from gensim->gensim_sum_ext) (1.17.2) Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /Users/jlartey10/opt/anaconda3/lib/python3.7/site-packages (from requests->pycorenlp->gensim_sum_ext) (1.24.2) Requirement already satisfied: idna<2.9,>=2.5 in /Users/jlartey10/opt/anaconda3/lib/python3.7/site-packages (from requests->pycorenlp->gensim_sum_ext) (2.8) Requirement already satisfied: certifi>=2017.4.17 in /Users/jlartey10/opt/anaconda3/lib/python3.7/site-packages (from requests->pycorenlp->gensim_sum_ext) (2019.9.11) Requirement already satisfied: chardet<3.1.0,>=3.0.2 in /Users/jlartey10/opt/anaconda3/lib/python3.7/site-packages (from requests->pycorenlp->gensim_sum_ext) (3.0.4) Requirement already satisfied: boto3 in /Users/jlartey10/opt/anaconda3/lib/python3.7/site-packages (from smart-open>=1.8.1->gensim->gensim_sum_ext) (1.12.5) Requirement already satisfied: boto>=2.32 in 
/Users/jlartey10/opt/anaconda3/lib/python3.7/site-packages (from smart-open>=1.8.1->gensim->gensim_sum_ext) (2.49.0) Requirement already satisfied: jmespath<1.0.0,>=0.7.1 in /Users/jlartey10/opt/anaconda3/lib/python3.7/site-packages (from boto3->smart-open>=1.8.1->gensim->gensim_sum_ext) (0.9.4) Requirement already satisfied: s3transfer<0.4.0,>=0.3.0 in /Users/jlartey10/opt/anaconda3/lib/python3.7/site-packages (from boto3->smart-open>=1.8.1->gensim->gensim_sum_ext) (0.3.3) Requirement already satisfied: botocore<1.16.0,>=1.15.5 in /Users/jlartey10/opt/anaconda3/lib/python3.7/site-packages (from boto3->smart-open>=1.8.1->gensim->gensim_sum_ext) (1.15.5) Requirement already satisfied: python-dateutil<3.0.0,>=2.1 in /Users/jlartey10/opt/anaconda3/lib/python3.7/site-packages (from botocore<1.16.0,>=1.15.5->boto3->smart-open>=1.8.1->gensim->gensim_sum_ext) (2.8.0) Requirement already satisfied: docutils<0.16,>=0.10 in /Users/jlartey10/opt/anaconda3/lib/python3.7/site-packages (from botocore<1.16.0,>=1.15.5->boto3->smart-open>=1.8.1->gensim->gensim_sum_ext) (0.15.2)
[nltk_data] Downloading package wordnet to [nltk_data] /Users/jlartey10/nltk_data... [nltk_data] Package wordnet is already up-to-date! [nltk_data] Downloading package punkt to /Users/jlartey10/nltk_data... [nltk_data] Package punkt is already up-to-date! [nltk_data] Downloading package stopwords to [nltk_data] /Users/jlartey10/nltk_data... [nltk_data] Package stopwords is already up-to-date!
Requirement already satisfied: spacy in /Users/jlartey10/opt/anaconda3/lib/python3.7/site-packages (2.2.4)
Requirement already satisfied: numpy>=1.15.0 in /Users/jlartey10/opt/anaconda3/lib/python3.7/site-packages (from spacy) (1.17.2)
Requirement already satisfied: preshed<3.1.0,>=3.0.2 in /Users/jlartey10/opt/anaconda3/lib/python3.7/site-packages (from spacy) (3.0.2)
Requirement already satisfied: murmurhash<1.1.0,>=0.28.0 in /Users/jlartey10/opt/anaconda3/lib/python3.7/site-packages (from spacy) (1.0.2)
Requirement already satisfied: wasabi<1.1.0,>=0.4.0 in /Users/jlartey10/opt/anaconda3/lib/python3.7/site-packages (from spacy) (0.6.0)
Requirement already satisfied: requests<3.0.0,>=2.13.0 in /Users/jlartey10/opt/anaconda3/lib/python3.7/site-packages (from spacy) (2.22.0)
Requirement already satisfied: catalogue<1.1.0,>=0.0.7 in /Users/jlartey10/opt/anaconda3/lib/python3.7/site-packages (from spacy) (1.0.0)
Requirement already satisfied: blis<0.5.0,>=0.4.0 in /Users/jlartey10/opt/anaconda3/lib/python3.7/site-packages (from spacy) (0.4.1)
Requirement already satisfied: tqdm<5.0.0,>=4.38.0 in /Users/jlartey10/opt/anaconda3/lib/python3.7/site-packages (from spacy) (4.45.0)
Requirement already satisfied: setuptools in /Users/jlartey10/opt/anaconda3/lib/python3.7/site-packages (from spacy) (41.4.0)
Requirement already satisfied: cymem<2.1.0,>=2.0.2 in /Users/jlartey10/opt/anaconda3/lib/python3.7/site-packages (from spacy) (2.0.3)
Requirement already satisfied: thinc==7.4.0 in /Users/jlartey10/opt/anaconda3/lib/python3.7/site-packages (from spacy) (7.4.0)
Requirement already satisfied: srsly<1.1.0,>=1.0.2 in /Users/jlartey10/opt/anaconda3/lib/python3.7/site-packages (from spacy) (1.0.2)
Requirement already satisfied: plac<1.2.0,>=0.9.6 in /Users/jlartey10/opt/anaconda3/lib/python3.7/site-packages (from spacy) (1.1.3)
Requirement already satisfied: chardet<3.1.0,>=3.0.2 in /Users/jlartey10/opt/anaconda3/lib/python3.7/site-packages (from requests<3.0.0,>=2.13.0->spacy) (3.0.4)
Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /Users/jlartey10/opt/anaconda3/lib/python3.7/site-packages (from requests<3.0.0,>=2.13.0->spacy) (1.24.2)
Requirement already satisfied: certifi>=2017.4.17 in /Users/jlartey10/opt/anaconda3/lib/python3.7/site-packages (from requests<3.0.0,>=2.13.0->spacy) (2019.9.11)
Requirement already satisfied: idna<2.9,>=2.5 in /Users/jlartey10/opt/anaconda3/lib/python3.7/site-packages (from requests<3.0.0,>=2.13.0->spacy) (2.8)
Requirement already satisfied: importlib-metadata>=0.20; python_version < "3.8" in /Users/jlartey10/opt/anaconda3/lib/python3.7/site-packages (from catalogue<1.1.0,>=0.0.7->spacy) (0.23)
Requirement already satisfied: zipp>=0.5 in /Users/jlartey10/opt/anaconda3/lib/python3.7/site-packages (from importlib-metadata>=0.20; python_version < "3.8"->catalogue<1.1.0,>=0.0.7->spacy) (0.6.0)
Requirement already satisfied: more-itertools in /Users/jlartey10/opt/anaconda3/lib/python3.7/site-packages (from zipp>=0.5->importlib-metadata>=0.20; python_version < "3.8"->catalogue<1.1.0,>=0.0.7->spacy) (7.2.0)
Collecting https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.2.0/en_core_web_sm-2.2.0.tar.gz
Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.2.0/en_core_web_sm-2.2.0.tar.gz (12.0MB)
|████████████████████████████████| 12.0MB 6.8MB/s eta 0:00:01
Requirement already satisfied (use --upgrade to upgrade): en-core-web-sm==2.2.0 from https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.2.0/en_core_web_sm-2.2.0.tar.gz in /Users/jlartey10/opt/anaconda3/lib/python3.7/site-packages
Building wheels for collected packages: en-core-web-sm
Building wheel for en-core-web-sm (setup.py) ... done
Created wheel for en-core-web-sm: filename=en_core_web_sm-2.2.0-cp37-none-any.whl size=12019125 sha256=db189fc8701e5b7b0f9197c1c43a8645b18396e7e310883130facfd464cbf4e1
Stored in directory: /Users/jlartey10/Library/Caches/pip/wheels/48/5c/1c/15f9d02afc8221a668d2172446dd8467b20cdb9aef80a172a4
Successfully built en-core-web-sm
def _scrape_webpage(url):
    """Fetch *url* and return the concatenated text of all its <p> tags.

    Parameters
    ----------
    url : str
        Address of the page to scrape.

    Returns
    -------
    str
        The text of every paragraph, joined in document order.
    """
    # Context manager guarantees the HTTP response is closed even if
    # read() raises (the original never closed it — a resource leak).
    with urllib.request.urlopen(url) as response:
        raw_html = response.read()
    parsed_textdata = bs.BeautifulSoup(raw_html, 'lxml')
    # join() builds the result in one pass instead of quadratic `+=`.
    return "".join(para.text for para in parsed_textdata.find_all('p'))
# Scrape the NYT live-coverage page and lowercase it for later token counting.
# NOTE: the original URL contained "®ion=" — a mojibake of "&region="
# (the "&reg" was rendered as the "®" entity); the literal parameter is restored.
mytext = _scrape_webpage(
    'https://www.nytimes.com/2020/04/25/world/coronavirus-news.html'
    '?action=click&pgtype=Article&state=default'
    '&module=styln-coronavirus-world&variant=show&region=TOP_BANNER'
    '&context=storyline_menu'
).lower()
# Preview the first 1,500 characters of the scraped article.
print(mytext[:1500])
advertisementbelarus, led by a virus-denying autocrat, has imposed no lockdowns. brazil’s leader, who has played down the virus, faces an uncertain future.right nowafter a backlash to his suggestion that injecting disinfectants could combat the coronavirus, president trump indicates his daily white house briefings are no longer worth his time.many americans who had assumed they could stay overseas till the pandemic ebbed now face an unnerving choice: either prepare for the possibility of being infected and treated in foreign hospitals, or risk infection on the way back to the united states.the u.s. state department, warning that commercial flights from overseas may end in the coming days, is urging americans abroad to grab any opportunity to board them.flights organized by the state department that have so far returned 65,000 americans from across the world are winding down. some continue in limited numbers, in areas like the indian subcontinent and africa. and american diplomats have helped commercial airlines cut through foreign regulations that have restricted flights in and out of some countries during the pandemic. but there still are at least 17,000 american citizens or legal residents abroad who have indicated they need help.officials are suggesting that the risk of contracting the virus from traveling on any kind of flight is offset by disparities in health care systems.“you can come back to the united states where you are a citizen and you have access to health care
1.2.1 Count all the named entities in the document (0.2 points) 1.2.2 Count the most frequent tokens for the entire document (0.2 points)
# Run the small English spaCy pipeline over the article and collect the
# label and surface text of every named entity, printing each span as we go.
nlp = en_core_web_sm.load()
doc = nlp(mytext)
e_type, e_text = [], []
for entity in doc.ents:
    print(entity.text, entity.start_char, entity.end_char, entity.label_)
    e_type.append(entity.label_)
    e_text.append(entity.text)
advertisementbelarus 0 20 GPE brazil 81 87 GPE trump 268 273 PERSON daily 288 293 DATE white house 294 305 ORG americans 350 359 NORP u.s. 601 605 GPE state department 606 622 ORG the coming days 681 696 DATE americans 708 717 NORP the state department 784 804 ORG 65,000 831 837 CARDINAL americans 838 847 NORP indian 940 946 NORP africa 964 970 LOC american 976 984 NORP at least 17,000 1158 1173 CARDINAL american 1174 1182 NORP the united states 1423 1440 GPE william walters 1573 1588 PERSON the state department’s 1590 1612 ORG american 1754 1762 NORP the united states 1877 1894 GPE more than a million 1961 1980 CARDINAL the united states 2046 2063 GPE u.s. 2607 2611 GPE pedro sánchez 2937 2950 PERSON saturday 2959 2967 DATE evening 2968 2975 TIME may 2 3052 3057 DATE sánchez 3165 3172 PERSON spanish 3257 3264 NORP sunday 3301 3307 DATE first 3316 3321 ORDINAL mid-march 3365 3374 DATE one hour 3408 3416 TIME one kilometer 3424 3437 QUANTITY sánchez 3486 3493 PERSON spain 3520 3525 GPE daily 3806 3811 DATE 378 3828 3831 CARDINAL saturday 3840 3848 DATE 367 3862 3865 CARDINAL friday 3869 3875 DATE this week 3925 3934 DATE may 9 4049 4054 DATE daily 4119 4124 DATE below 10 4172 4180 CARDINAL half 4190 4194 CARDINAL spain 4198 4203 GPE 17 4206 4208 CARDINAL covid-19 patients.as 4367 4387 ORG brazil 4388 4394 GPE this past week 4431 4445 DATE nearly 53,000 4452 4465 CARDINAL 3,670 4486 4491 CARDINAL jair bolsonaro 4555 4569 PERSON recent weeks 4593 4605 DATE bolsonaro 4611 4620 PERSON abroad.he 4828 4837 ORG friday 4892 4898 DATE november 5082 5090 DATE the social liberal party 5126 5150 ORG two 5232 5235 CARDINAL bolsonaro 5243 5252 PERSON bolsonaro 5380 5389 PERSON latin american 5621 5635 NORP bolsonaro 5767 5776 PERSON late march 5927 5937 DATE belarus 5953 5960 GPE belarusian 6073 6083 NORP aleksandr g. 
lukashenko 6095 6118 PERSON ”at 6215 6218 PERSON germany 6252 6259 GPE denmark 6264 6271 GPE belarus 6395 6402 GPE last weekend 6517 6529 DATE earlier this month 6640 6658 DATE nearly 9,000 6803 6815 CARDINAL 63 6839 6841 DATE ukraine 6911 6918 GPE four 6924 6928 CARDINAL one 7265 7268 CARDINAL metopera.org 7626 7638 ORG sunday 7671 7677 DATE evening eastern time 7678 7698 TIME more than 40 7772 7784 CARDINAL nine 7886 7890 CARDINAL baton rouge 7938 7949 GPE la. 7951 7954 GPE anna netrebko 7956 7969 PERSON vienna 7974 7980 GPE peter gelb 8025 8035 PERSON earth 8095 8100 LOC march 12 8142 8150 DATE gelb 8250 8254 PERSON new york 8273 8281 GPE yannick nézet-séguin 8293 8313 ORG montreal 8351 8359 GPE the opera house 8541 8556 ORG every night 8638 8649 TIME gelb 8656 8660 PERSON the past five weeks 8674 8693 DATE 30,000 8768 8774 CARDINAL button 8822 8828 GPE one 8851 8854 CARDINAL gelb 8872 8876 PERSON britain 9007 9014 GPE boris johnson 9152 9165 PERSON saturday 9287 9295 DATE cummings 9305 9313 PERSON 10 9378 9380 CARDINAL saturday 9630 9638 DATE britain 9639 9646 GPE department of health and social care 9649 9685 ORG 20,000 9746 9752 CARDINAL 150,000 9803 9810 CARDINAL britain 9812 9819 GPE fifth 9827 9832 ORDINAL 20,000 9969 9975 CARDINAL 20,000 9994 10000 CARDINAL stephen powis 10012 10025 PERSON the national health service 10051 10078 ORG england 10082 10089 GPE saturday 10099 10107 DATE british 10277 10284 NORP cummings 10588 10596 PERSON sky news 10664 10672 ORG guardian 10742 10750 ORG last year 10798 10807 DATE britain 10813 10820 GPE britain 11096 11103 GPE saturday 11208 11216 DATE the day 11237 11244 DATE tens of thousands 11257 11274 CARDINAL 8 a.m. 
11344 11350 TIME saturday 11354 11362 DATE the website.the defense ministry 11400 11432 ORG friday 11441 11447 DATE tom moore 11695 11704 PERSON a world war ii 11718 11732 EVENT 1 11765 11766 CARDINAL this week 11781 11790 DATE michael ball 11957 11969 PERSON more than 2,837,000 12069 12088 CARDINAL at least 177 12099 12111 CARDINAL the united states 12141 12158 GPE france 12160 12166 GPE the netherlands 12171 12186 GPE more taxpayer dollars 12201 12222 MONEY u.s. 12326 12330 GPE treasury department 12331 12350 ORG saturday 12359 12367 DATE an additional $9.5 billion 12386 12412 MONEY american 12416 12424 NORP $2.2 trillion 12502 12515 MONEY congress 12533 12541 ORG last month 12551 12561 DATE $17 billion 12674 12685 MONEY france 12762 12768 GPE netherlands 12777 12788 GPE 10 billion 12816 12826 CARDINAL about $10.8 billion 12834 12853 MONEY europe 12889 12895 LOC france 12919 12925 GPE €4 billion 12945 12955 MONEY french 12980 12986 NORP €3 billion 12999 13009 MONEY france 13034 13040 GPE late friday 13066 13077 DATE dutch 13083 13088 NORP an additional €2 billion to €4 billion 13122 13160 MONEY french 13243 13249 NORP dutch 13254 13259 NORP 14 percent 13278 13288 PERCENT the european commission 13296 13319 ORG the european union 13346 13364 ORG the bailout.the world health organization 13474 13515 ORG friday 13827 13833 DATE italy 14005 14010 GPE chile 14015 14020 GPE the united states 14219 14236 GPE 14 14303 14305 CARDINAL only one 14362 14370 CARDINAL two 14416 14419 CARDINAL 99 percent 14436 14446 PERCENT friday 14827 14833 DATE the united states 14948 14965 GPE china 14970 14975 GPE angela merkel 15031 15044 PERSON germany 15048 15055 GPE emmanuel macron 15070 15085 PERSON france 15089 15095 GPE u.s. 
15144 15148 GPE about $689 billion 15318 15336 MONEY 2018 15362 15366 DATE the world bank 15381 15395 ORG millions 15470 15478 CARDINAL their hours 15502 15513 TIME the world bank 15589 15603 ORG this week 15609 15618 DATE about 20 percent 15669 15685 PERCENT this year 15686 15695 DATE mexico 15825 15831 GPE third 15847 15852 ORDINAL 2018 15889 15893 DATE india 15902 15907 GPE china 15912 15917 GPE the world bank 15932 15946 ORG u.s. 16021 16025 GPE recent weeks 16047 16059 DATE millions 16061 16069 CARDINAL mexicans 16086 16094 NORP the united states 16098 16115 GPE benefits.a 16215 16225 GPE roy germano 16338 16349 PERSON new york university 16390 16409 ORG germano 16546 16553 PERSON russian 16787 16794 NORP connecticut 16866 16877 GPE julia vasilenko 16900 16915 PERSON kandinsky’s musical and 17015 17038 ORG the present day 17095 17110 DATE lo-fi 17297 17302 PERSON more than half a million 17339 17363 CARDINAL just a few weeks 17373 17389 DATE russian 17452 17459 NORP moscow 17517 17523 GPE russian 17592 17599 NORP more than a third 17605 17622 CARDINAL los angeles 17740 17751 GPE amsterdam 17775 17784 GPE izoizolyacia’s 17940 17954 ORG cairo 18142 18147 GPE the middle east 18177 18192 LOC this past week 18226 18240 DATE egypt 18245 18250 GPE friday 18487 18493 DATE the holy month 18495 18509 DATE the united arab emirates 18584 18608 GPE kuwait 18669 18675 GPE saudi arabia 18768 18780 GPE the middle east 19072 19087 LOC arab 19117 19121 NORP egypt 19344 19349 GPE about 3,900 19379 19390 CARDINAL 100 million 19401 19412 CARDINAL islam 19527 19532 PERSON cairo 19589 19594 GPE three 19747 19752 CARDINAL republicans 19893 19904 NORP trump 20077 20082 PERSON battleground states 20109 20128 GPE michigan 20134 20142 GPE pennsylvania 20147 20159 GPE florida 20204 20211 GPE trump 20217 20222 PERSON friday 20325 20331 DATE fox news personalities.the white house 20428 20466 ORG friday 20481 20487 DATE trump 20512 20517 PERSON thursday 20546 20554 DATE mike pence 20588 
20598 PERSON the white house coronavirus task force 20612 20650 ORG friday 20667 20673 DATE daily 20676 20681 DATE trump 20761 20766 PERSON daily 20810 20815 DATE trump 20879 20884 PERSON saturday 20912 20920 DATE “what 20961 20966 PERSON white house news conferences 20992 21020 ORG trump 21144 21149 PERSON american 21199 21207 NORP the time & effort!”here 21252 21275 ORG u.s. 21308 21312 GPE trump 21840 21845 PERSON u.s. 22030 22034 GPE about 1.2 million 22045 22062 CARDINAL a week 22072 22078 DATE trump 22199 22204 PERSON more than six 22267 22280 CARDINAL the second week of april 22310 22334 DATE the new york times 22355 22373 ORG friday 22384 22390 DATE the food and drug administration 22392 22424 ORG the united nations 22644 22662 ORG a nobel prize 22749 22762 WORK_OF_ART april 15 22772 22780 DATE dolgellau 22784 22793 GPE 88.the 22809 22815 PERSON hannah malcolm 22897 22911 PERSON twitter.a 22956 22965 CARDINAL the united nations intergovernmental 22985 23021 ORG houghton 23051 23059 PERSON first 23102 23107 ORDINAL three 23108 23113 CARDINAL 1990 23133 23137 DATE 1995 23139 23143 DATE 2001 23148 23152 DATE 2007 23359 23363 DATE al gore 23395 23402 PERSON 1994 23486 23490 DATE houghton 23496 23504 PERSON 10 or 20 years 23637 23651 DATE about one million 23725 23742 CARDINAL 9,000 square miles 23765 23783 QUANTITY djibouti 23785 23793 GPE one 23797 23800 CARDINAL africa 23804 23810 LOC this week 23837 23846 DATE africa 23859 23865 LOC 986 23939 23942 CARDINAL ismail omar guelleh 24347 24366 PERSON the red sea 24468 24479 LOC one 24686 24689 CARDINAL the united states’ 24693 24711 GPE china 24746 24751 GPE first 24754 24759 ORDINAL measures.on 24873 24884 ORG april 23 24885 24893 DATE michael d. turello 24905 24923 PERSON africa 24976 24982 LOC u.s. 
25027 25031 GPE djibouti 25065 25073 GPE okinawa city 25332 25344 GPE japan 25346 25351 GPE thousands 25408 25417 CARDINAL this month 25571 25581 DATE 10,000 25734 25740 CARDINAL 16,000 25748 25754 CARDINAL friday 25765 25771 DATE a whole year 25789 25801 DATE saturday 25863 25871 DATE this month 25948 25958 DATE fukushima 26102 26111 PERSON u.s. 26145 26149 GPE mid-may 26357 26364 DATE saturday 26565 26573 DATE japan 26575 26580 GPE more than 13,500 26585 26601 CARDINAL 341 26634 26637 CARDINAL israel 26648 26654 GPE monday 26773 26779 DATE tuesday 26797 26804 DATE hundreds of thousands 26835 26856 CARDINAL memorial day 27057 27069 DATE frida shniderman 27121 27137 PERSON 72 27139 27141 DATE meir rozenchtroch 27169 27186 PERSON israel 27237 27243 GPE syria 27248 27253 GPE 1974 27257 27261 DATE every year 27292 27302 DATE naftali bennett 27354 27369 PERSON past years 27430 27440 DATE about one and a half million 27489 27517 CARDINAL israelis 27518 27526 NORP israelis 27608 27616 NORP “my 27766 27769 PERSON moshe muskal 27798 27810 PERSON 64 27812 27814 DATE israel 27865 27871 GPE lebanese 27880 27888 NORP hezbollah 27904 27913 ORG 2006 27917 27921 DATE this day 27924 27932 DATE israelis 28010 28018 NORP more than 15,000 28124 28140 CARDINAL saturday 28166 28174 DATE nearly 200 28180 28190 CARDINAL pacific island 28209 28223 LOC 300,000 28234 28241 CARDINAL saturday 28305 28313 DATE ’s cup of tea 28393 28406 EVENT the vanuatu cricket association 28412 28443 ORG shane deitz 28691 28702 PERSON the vanuatu cricket association 28727 28758 ORG the australian national team 28783 28811 ORG late last month 29127 29142 DATE april 6 29199 29206 DATE saturday 29293 29301 DATE a few feet 29381 29391 QUANTITY indonesia 29606 29615 GPE joko widodo 29785 29796 PERSON the republic of indonesia 29800 29825 GPE trump 29832 29837 PERSON friday 29847 29853 DATE us!”in 29929 29935 ORG joko’s 29947 29953 ORG fadjroel rachman 29965 29981 PERSON saturday 29994 30002 DATE the republic of 
indonesia mr 30067 30095 GPE president 30097 30106 PERSON indonesia 30109 30118 GPE 270 million 30141 30152 CARDINAL fourth 30169 30175 ORDINAL only about 8,400 30200 30216 CARDINAL 34 30292 30294 CARDINAL 720 30328 30331 CARDINAL second 30365 30371 ORDINAL east asia 30388 30397 LOC china 30404 30409 GPE unreported.mr 30461 30474 GPE the united states 30492 30509 GPE three 30536 30541 CARDINAL latin american 30542 30556 NORP ecuador 30569 30576 GPE el salvador 30578 30589 GPE honduras 30594 30602 GPE four 30705 30709 CARDINAL honduran 30986 30994 NORP juan orlando 31006 31018 PERSON trump 31050 31055 PERSON friday 31248 31254 DATE evening 31255 31262 TIME honduran 31295 31303 NORP trump 31319 31324 PERSON lara jakes 31476 31486 GPE anton troianovski 31488 31505 PERSON andrew e. kramer 31507 31523 PERSON kai schultz 31525 31536 PERSON richard c. paddock 31571 31589 PERSON mike ives 31604 31613 PERSON elian peltier 31628 31641 PERSON john schwartz 31643 31656 PERSON caitlin dickerson 31658 31675 PERSON liz alderman 31677 31689 PERSON elaine yu 31704 31713 PERSON hisako ueno 31715 31726 PERSON adam rasgon 31728 31739 PERSON adam nossiter 31741 31754 PERSON evan easterling 31756 31771 PERSON andrew lavallee 31773 31788 PERSON joshua barone 31790 31803 PERSON damien cave 31805 31816 PERSON jin wu 31818 31824 PERSON declan walsh 31826 31838 PERSON maggie haberman 31840 31855 PERSON jonathan martin 31857 31872 PERSON alexandra stevenson 31874 31893 PERSON raphael minder 31895 31909 PERSON latif dahir 31916 31927 PERSON nicholas kulish 31929 31944 PERSON ernesto londoño 31946 31961 PERSON letícia casado 31963 31977 ORG manuela andreoni 31979 31995 PERSON david gelles 32000 32012 PERSON wang yiwei 32014 32024 PERSON april 11 32054 32062 DATE american enterprise institute 32237 32266 ORG scott gottlieb 32275 32289 PERSON caitlin rivers 32291 32305 PERSON mark b. 
mcclellan 32307 32324 PERSON lauren silvis 32326 32339 PERSON crystal watson 32344 32358 PERSON four 32370 32374 CARDINAL at least 14 32736 32747 CARDINAL the american red cross 33066 33088 ORG more than 30,000 33168 33184 CARDINAL the past few weeks 33243 33261 DATE though.)if 33355 33365 GPE americans 33705 33714 NORP c.d.c. 33911 33917 GPE distancing.if 34232 34245 PERSON tested.it 34600 34609 PERSON the united states 34946 34963 GPE china 34965 34970 GPE europe 34975 34981 LOC american 34987 34995 NORP at least 12 35069 35080 CARDINAL two weeks 36059 36068 DATE six feet 36346 36354 QUANTITY the next five years.watching 36856 36884 DATE
#1.2.1 Count all the named entities in the document (0.2 points)
#1.2.2 Count the most frequent tokens for the entire document (0.2 points)
def CountFrequency(my_list):
    """Tally how often each item occurs in *my_list*.

    Prints every ``item count`` pair in first-seen order and returns the
    frequency table.

    Parameters
    ----------
    my_list : list
        Items to count (must be hashable).

    Returns
    -------
    dict
        Mapping of item -> number of occurrences.
    """
    # Counter tallies everything in one O(n) pass; the original called
    # list.count() once per element, which is O(n^2). Counter preserves
    # first-seen insertion order, so the printed output is unchanged.
    freq = dict(collections.Counter(my_list))
    for key, value in freq.items():
        print(key, value)
    return freq
# Driver function
if __name__ == "__main__":
    # 1.2.1 — frequency of each entity label (GPE, PERSON, DATE, ...),
    # using the e_type list built by the NER cell above.
    named_entities_count = CountFrequency(e_type)
    # 1.2.2 — frequency of each entity's surface text (e_text).
    token_count = CountFrequency(e_text)
GPE 95 PERSON 97 DATE 95 ORG 44 NORP 32 CARDINAL 66 LOC 12 TIME 7 ORDINAL 7 QUANTITY 4 EVENT 2 MONEY 9 PERCENT 3 WORK_OF_ART 1 advertisementbelarus 1 brazil 2 trump 12 daily 5 white house 1 americans 4 u.s. 9 state department 1 the coming days 1 the state department 1 65,000 1 indian 1 africa 4 american 6 at least 17,000 1 the united states 9 william walters 1 the state department’s 1 more than a million 1 pedro sánchez 1 saturday 15 evening 2 may 2 1 sánchez 2 spanish 1 sunday 2 first 3 mid-march 1 one hour 1 one kilometer 1 spain 2 378 1 367 1 friday 13 this week 4 may 9 1 below 10 1 half 1 17 1 covid-19 patients.as 1 this past week 2 nearly 53,000 1 3,670 1 jair bolsonaro 1 recent weeks 2 bolsonaro 4 abroad.he 1 november 1 the social liberal party 1 two 2 latin american 2 late march 1 belarus 2 belarusian 1 aleksandr g. lukashenko 1 ”at 1 germany 2 denmark 1 last weekend 1 earlier this month 1 nearly 9,000 1 63 1 ukraine 1 four 3 one 4 metopera.org 1 evening eastern time 1 more than 40 1 nine 1 baton rouge 1 la. 1 anna netrebko 1 vienna 1 peter gelb 1 earth 1 march 12 1 gelb 3 new york 1 yannick nézet-séguin 1 montreal 1 the opera house 1 every night 1 the past five weeks 1 30,000 1 button 1 britain 5 boris johnson 1 cummings 2 10 1 department of health and social care 1 20,000 3 150,000 1 fifth 1 stephen powis 1 the national health service 1 england 1 british 1 sky news 1 guardian 1 last year 1 the day 1 tens of thousands 1 8 a.m. 
1 the website.the defense ministry 1 tom moore 1 a world war ii 1 1 1 michael ball 1 more than 2,837,000 1 at least 177 1 france 5 the netherlands 1 more taxpayer dollars 1 treasury department 1 an additional $9.5 billion 1 $2.2 trillion 1 congress 1 last month 1 $17 billion 1 netherlands 1 10 billion 1 about $10.8 billion 1 europe 2 €4 billion 1 french 2 €3 billion 1 late friday 1 dutch 2 an additional €2 billion to €4 billion 1 14 percent 1 the european commission 1 the european union 1 the bailout.the world health organization 1 italy 1 chile 1 14 1 only one 1 99 percent 1 china 5 angela merkel 1 emmanuel macron 1 about $689 billion 1 2018 2 the world bank 3 millions 2 their hours 1 about 20 percent 1 this year 1 mexico 1 third 1 india 1 mexicans 1 benefits.a 1 roy germano 1 new york university 1 germano 1 russian 3 connecticut 1 julia vasilenko 1 kandinsky’s musical and 1 the present day 1 lo-fi 1 more than half a million 1 just a few weeks 1 moscow 1 more than a third 1 los angeles 1 amsterdam 1 izoizolyacia’s 1 cairo 2 the middle east 2 egypt 2 the holy month 1 the united arab emirates 1 kuwait 1 saudi arabia 1 arab 1 about 3,900 1 100 million 1 islam 1 three 3 republicans 1 battleground states 1 michigan 1 pennsylvania 1 florida 1 fox news personalities.the white house 1 thursday 1 mike pence 1 the white house coronavirus task force 1 “what 1 white house news conferences 1 the time & effort!”here 1 about 1.2 million 1 a week 1 more than six 1 the second week of april 1 the new york times 1 the food and drug administration 1 the united nations 1 a nobel prize 1 april 15 1 dolgellau 1 88.the 1 hannah malcolm 1 twitter.a 1 the united nations intergovernmental 1 houghton 2 1990 1 1995 1 2001 1 2007 1 al gore 1 1994 1 10 or 20 years 1 about one million 1 9,000 square miles 1 djibouti 2 986 1 ismail omar guelleh 1 the red sea 1 the united states’ 1 measures.on 1 april 23 1 michael d. 
turello 1 okinawa city 1 japan 2 thousands 1 this month 2 10,000 1 16,000 1 a whole year 1 fukushima 1 mid-may 1 more than 13,500 1 341 1 israel 3 monday 1 tuesday 1 hundreds of thousands 1 memorial day 1 frida shniderman 1 72 1 meir rozenchtroch 1 syria 1 1974 1 every year 1 naftali bennett 1 past years 1 about one and a half million 1 israelis 3 “my 1 moshe muskal 1 64 1 lebanese 1 hezbollah 1 2006 1 this day 1 more than 15,000 1 nearly 200 1 pacific island 1 300,000 1 ’s cup of tea 1 the vanuatu cricket association 2 shane deitz 1 the australian national team 1 late last month 1 april 6 1 a few feet 1 indonesia 2 joko widodo 1 the republic of indonesia 1 us!”in 1 joko’s 1 fadjroel rachman 1 the republic of indonesia mr 1 president 1 270 million 1 fourth 1 only about 8,400 1 34 1 720 1 second 1 east asia 1 unreported.mr 1 ecuador 1 el salvador 1 honduras 1 honduran 2 juan orlando 1 lara jakes 1 anton troianovski 1 andrew e. kramer 1 kai schultz 1 richard c. paddock 1 mike ives 1 elian peltier 1 john schwartz 1 caitlin dickerson 1 liz alderman 1 elaine yu 1 hisako ueno 1 adam rasgon 1 adam nossiter 1 evan easterling 1 andrew lavallee 1 joshua barone 1 damien cave 1 jin wu 1 declan walsh 1 maggie haberman 1 jonathan martin 1 alexandra stevenson 1 raphael minder 1 latif dahir 1 nicholas kulish 1 ernesto londoño 1 letícia casado 1 manuela andreoni 1 david gelles 1 wang yiwei 1 april 11 1 american enterprise institute 1 scott gottlieb 1 caitlin rivers 1 mark b. mcclellan 1 lauren silvis 1 crystal watson 1 at least 14 1 the american red cross 1 more than 30,000 1 the past few weeks 1 though.)if 1 c.d.c. 1 distancing.if 1 tested.it 1 at least 12 1 two weeks 1 six feet 1 the next five years.watching 1
# 1.2.1 — count the distinct named-entity labels in the document.
# dict.fromkeys() deduplicates in O(n) while preserving first-seen order,
# replacing the original O(n^2) "if item not in list" scan.
uniqueWords = list(dict.fromkeys(e_type))
print("Count of all the named entities in the document are:", len(uniqueWords))
Count of all the named entities in the document are: 14
# Load the small English pipeline and tokenise the article with spaCy.
nlp = en_core_web_sm.load()
doc = nlp(mytext.lower())
# One token string (token.orth_) per spaCy token, in document order.
list_single_words = [token.orth_ for token in doc]
list_single_words
['advertisementbelarus', ',', 'led', 'by', 'a', 'virus', '-', 'denying', 'autocrat', ',', 'has', 'imposed', 'no', 'lockdowns', '.', 'brazil', '’s', 'leader', ',', 'who', 'has', 'played', 'down', 'the', 'virus', ',', 'faces', 'an', 'uncertain', 'future.right', 'nowafter', 'a', 'backlash', 'to', 'his', 'suggestion', 'that', 'injecting', 'disinfectants', 'could', 'combat', 'the', 'coronavirus', ',', 'president', 'trump', 'indicates', 'his', 'daily', 'white', 'house', 'briefings', 'are', 'no', 'longer', 'worth', 'his', 'time.many', 'americans', 'who', 'had', 'assumed', 'they', 'could', 'stay', 'overseas', 'till', 'the', 'pandemic', 'ebbed', 'now', 'face', 'an', 'unnerving', 'choice', ':', 'either', 'prepare', 'for', 'the', 'possibility', 'of', 'being', 'infected', 'and', 'treated', 'in', 'foreign', 'hospitals', ',', 'or', 'risk', 'infection', 'on', 'the', 'way', 'back', 'to', 'the', 'united', 'states.the', 'u.s', '.', 'state', 'department', ',', 'warning', 'that', 'commercial', 'flights', 'from', 'overseas', 'may', 'end', 'in', 'the', 'coming', 'days', ',', 'is', 'urging', 'americans', 'abroad', 'to', 'grab', 'any', 'opportunity', 'to', 'board', 'them.flights', 'organized', 'by', 'the', 'state', 'department', 'that', 'have', 'so', 'far', 'returned', '65,000', 'americans', 'from', 'across', 'the', 'world', 'are', 'winding', 'down', '.', 'some', 'continue', 'in', 'limited', 'numbers', ',', 'in', 'areas', 'like', 'the', 'indian', 'subcontinent', 'and', 'africa', '.', 'and', 'american', 'diplomats', 'have', 'helped', 'commercial', 'airlines', 'cut', 'through', 'foreign', 'regulations', 'that', 'have', 'restricted', 'flights', 'in', 'and', 'out', 'of', 'some', 'countries', 'during', 'the', 'pandemic', '.', 'but', 'there', 'still', 'are', 'at', 'least', '17,000', 'american', 'citizens', 'or', 'legal', 'residents', 'abroad', 'who', 'have', 'indicated', 'they', 'need', 'help.officials', 'are', 'suggesting', 'that', 'the', 'risk', 'of', 'contracting', 'the', 'virus', 'from', 
'traveling', 'on', 'any', 'kind', 'of', 'flight', 'is', 'offset', 'by', 'disparities', 'in', 'health', 'care', 'systems', '.', '“you', 'can', 'come', 'back', 'to', 'the', 'united', 'states', 'where', 'you', 'are', 'a', 'citizen', 'and', 'you', 'have', 'access', 'to', 'health', 'care', 'and', 'you', 'have', 'access', 'to', 'an', 'infrastructure', 'that', 'is', 'still', 'intact', ',', '”', 'said', 'dr', '.', 'william', 'walters', ',', 'the', 'state', 'department', '’s', 'deputy', 'chief', 'medical', 'officer', '.', 'but', 'by', 'hunkering', 'down', ',', 'in', 'developing', 'countries', 'in', 'particular', ',', 'where', 'the', 'virus', 'has', 'yet', 'to', 'peak', ',', '“', 'you', 'will', 'be', 'an', 'american', 'citizen', 'in', 'a', 'foreign', 'country', 'that', 'did', 'n’t', 'have', 'great', 'infrastructure', 'to', 'begin', 'with', '.', '”and', 'students', 'studying', 'abroad', 'in', 'the', 'united', 'states', 'have', 'been', 'stranded', 'with', 'dwindling', 'financial', 'resources.many', 'of', 'the', 'more', 'than', 'a', 'million', 'international', 'students', 'who', 'left', 'their', 'home', 'countries', 'to', 'study', 'in', 'the', 'united', 'states', 'had', 'been', 'living', 'in', 'college', 'dorms', '.', 'they', 'were', 'left', 'to', 'find', 'new', 'housing', 'after', 'campuses', 'shutdown', '.', 'a', 'substantial', 'number', 'of', 'them', 'are', 'also', 'watching', 'their', 'financial', 'lives', 'fall', 'apart', '.', 'visa', 'restrictions', 'prevent', 'them', 'from', 'working', 'off', 'campus', ',', 'but', 'campuses', 'are', 'now', 'closed', '.', 'and', 'while', 'some', 'come', 'from', 'families', 'wealthy', 'enough', 'to', 'pay', 'for', 'their', 'housing', 'or', 'whisk', 'them', 'home', ',', 'many', 'others', 'had', 'already', 'been', 'struggling', 'to', 'cobble', 'together', 'tuition', 'fees', ',', 'which', 'tend', 'to', 'be', 'much', 'higher', 'for', 'international', 'students', '.', 'and', 'many', 'currencies', '’', 'values', 'have', 'collapsed', 'relative', 
'to', 'the', 'u.s', '.', 'dollar.some', 'international', 'students', 'say', 'they', 'have', 'had', 'to', 'turn', 'to', 'food', 'banks', '.', 'others', 'are', 'couch', 'surfing', 'in', 'the', 'friends', '’', 'homes', ',', 'but', 'do', 'n’t', 'know', 'how', 'long', 'they', 'will', 'be', 'welcome', '.', 'those', 'who', 'were', 'able', 'to', 'fly', 'home', 'before', 'international', 'borders', 'closed', 'are', 'now', 'not', 'sure', 'they', 'will', 'be', 'able', 'to', 'come', 'back', 'to', 'finish', 'their', 'studies.prime', 'minister', 'pedro', 'sánchez', 'said', 'on', 'saturday', 'evening', 'that', 'spaniards', 'would', 'be', 'allowed', 'outdoors', 'to', 'take', 'a', 'stroll', 'or', 'exercise', 'as', 'of', 'may', '2', 'if', 'the', 'country', '’s', 'coronavirus', 'numbers', 'continued', 'to', 'improve', 'over', 'the', 'coming', 'week.in', 'a', 'televised', 'address', ',', 'mr', '.', 'sánchez', 'did', 'n’t', 'detail', 'what', 'limits', 'to', 'sports', 'would', 'still', 'apply', '.', 'but', 'his', 'announcement', 'came', 'as', 'spanish', 'children', 'prepared', 'to', 'go', 'outdoors', 'on', 'sunday', 'for', 'the', 'first', 'time', 'since', 'the', 'lockdown', 'came', 'into', 'force', 'in', 'mid', '-', 'march', '.', 'they', 'will', 'be', 'able', 'to', 'stroll', 'for', 'one', 'hour', 'within', 'one', 'kilometer', 'of', 'their', 'homes', 'when', 'accompanied', 'by', 'an', 'adult.mr', '.', 'sánchez', 'also', 'said', 'that', 'the', 'easing', 'spain', '’s', 'lockdown', 'would', 'be', '“', 'gradual', 'and', 'asymmetrical', ',', 'but', 'coordinated', '.', '”', 'some', 'regional', 'and', 'local', 'leaders', 'ague', 'that', 'their', 'areas', 'have', 'already', 'contained', 'the', 'epidemic', ',', 'and', 'they', 'are', 'pushing', 'to', 'reduce', 'lockdown', 'measures', 'locally', 'ahead', 'of', 'any', 'national', 'easing.spain', 'reported', 'a', 'slight', 'uptick', 'in', 'its', 'daily', 'death', 'toll', 'with', '378', 'dead', 'on', 'saturday', ',', 'compared', 'to', '367', 'on', 
'friday', '.', 'but', 'the', 'country', 'crossed', 'a', 'significant', 'milestone', 'this', 'week', 'by', 'registering', 'more', 'hospital', 'recoveries', 'than', 'new', 'coronavirus', 'cases.the', 'country', '’s', 'lockdown', 'has', 'been', 'extended', 'until', 'may', '9', '.', 'but', 'some', 'local', 'and', 'regional', 'politicians', 'have', 'stressed', 'that', 'the', 'daily', 'number', 'of', 'coronavirus', 'fatalities', 'has', 'fallen', 'to', 'below', '10', 'a', 'day', 'in', 'half', 'of', 'spain', '’s', '17', 'regions', '.', 'the', 'push', 'for', 'local', 'easing', 'of', 'restrictions', 'has', 'particularly', 'come', 'from', 'islands', 'as', 'well', 'as', 'southern', 'regions', 'whose', 'hospitals', 'were', 'never', 'overwhelmed', 'with', 'covid-19', 'patients.as', 'brazil', '’s', 'coronavirus', 'contagion', 'accelerated', 'this', 'past', 'week', ',', 'with', 'nearly', '53,000', 'confirmed', 'cases', 'and', '3,670', 'deaths', ',', 'speculation', 'intensified', 'over', 'how', 'much', 'longer', 'president', 'jair', 'bolsonaro', 'would', 'last', 'in', 'power.in', 'recent', 'weeks', ',', 'mr', '.', 'bolsonaro', '’s', 'strikingly', 'dismissive', 'response', 'to', 'the', 'coronavirus', 'pandemic', ',', 'which', 'he', 'has', 'called', 'a', '“', 'measly', 'cold', '”', 'that', 'can', 'not', 'be', 'allowed', 'to', 'throttle', 'economic', 'growth', ',', 'generated', 'calls', 'for', 'impeachment', 'at', 'home', 'and', 'bewilderment', 'abroad.he', 'was', 'already', 'struggling', 'to', 'govern', 'effectively', 'when', ',', 'on', 'friday', ',', 'his', 'star', 'cabinet', 'minister', 'resigned', 'with', 'an', 'explosive', 'speech', 'that', 'basically', 'called', 'his', 'soon', '-', 'to', '-', 'be', 'former', 'boss', 'a', 'criminal.mr', '.', 'bolsonaro', 'became', 'a', 'president', 'without', 'a', 'political', 'party', 'in', 'november', ',', 'after', 'falling', 'out', 'with', 'leaders', 'of', 'the', 'social', 'liberal', 'party', ',', 'which', 'had', 'backed', 'his', 
'presidential', 'bid', '.', 'and', 'several', 'political', 'allies', '—', 'including', 'two', 'of', 'mr', '.', 'bolsonaro', '’s', 'sons', '—', 'are', 'under', 'investigation', 'in', 'a', 'series', 'of', 'criminal', 'and', 'legislative', 'inquiries.given', 'those', 'challenges', ',', 'which', 'have', 'left', 'mr', '.', 'bolsonaro', 'deeply', 'isolated', ',', 'the', 'dramatic', 'exit', 'of', 'his', 'justice', 'minister', 'was', 'seen', 'by', 'critics', 'and', 'supporters', 'of', 'the', 'president', 'as', 'a', 'potentially', 'destructive', 'blow', 'to', 'his', 'grip', 'on', 'power', 'during', 'a', 'public', 'health', 'crisis', 'and', 'a', 'recession.and', ',', 'while', 'several', 'latin', 'american', 'leaders', 'have', 'seen', 'a', 'bounce', 'in', 'public', 'opinion', 'as', 'they', 'imposed', 'strict', 'quarantine', 'measures', 'to', 'curb', ...]
# #Tokenize words using regex
# #convert all text to lower case, and make sure to include punctuation in the n-gram models.
# #Make sure that you separate punctuation marks from text and treat them as tokens. Also treat numeric data as tokens.
# tokenizer = RegexpTokenizer(r"[^\W]+|[\.\?\!]")
# regexp_tokens = tokenizer.tokenize(mytext.lower())
# list_single_words = regexp_tokens
# #create a unigram
# #uniqueWords = set(list_single_words)
# list_single_words
def generateSentence(k, tokens=None):
    """Return the three consecutive sentences that follow the k-th
    sentence boundary, as one space-joined string of tokens.

    Sentence boundaries are the tokens '!', '.' and '?'.  The boundary
    token that closes sentence k is skipped; the next three sentences
    are collected with their own end punctuation included and joined
    with single spaces.  If fewer than three sentences remain, whatever
    was collected (possibly the empty string) is returned.

    The original implementation had an always-True condition
    (`x != '!' or x != '.' or x != '?'`) that appended the boundary
    punctuation too, then chopped it back off with a `sentence[1:]`
    slice hack; this version skips the boundary token directly and
    produces the same result for k >= 0.

    Args:
        k: number of sentence boundaries to skip before collecting
           (assumed >= 0, as produced by the caller's random draw).
        tokens: token sequence to scan; defaults to the module-level
            list_single_words built by the spaCy tokenization above.

    Side effect: prints the number of sentence boundaries scanned,
    matching the original notebook behavior.
    """
    if tokens is None:
        tokens = list_single_words
    enders = ('!', '.', '?')
    num_sentences = 3          # sentences to collect after the k-th boundary
    collected = []
    sentence_count = 0         # boundaries seen so far over the whole scan
    boundaries_past_k = 0      # boundaries seen once we are past sentence k
    for tok in tokens:
        if tok in enders:
            sentence_count += 1
        if sentence_count > k:
            # Skip the boundary token that closed sentence k itself;
            # keep everything after it, punctuation included.
            if boundaries_past_k > 0:
                collected.append(tok)
            if tok in enders:
                boundaries_past_k += 1
                # +1 because the k-th boundary itself counts as the first.
                if boundaries_past_k == num_sentences + 1:
                    break
    print(sentence_count)
    return ' '.join(collected)
#pick three consecutive sentences starting with Kth
# randrange(200) yields an integer in [0, 200) directly — clearer than
# the original int(random.uniform(0, 200)) round-trip through a float.
k = random.randrange(200)
sentences = generateSentence(k)
sentences  # notebook cell echo of the extracted sentences
186
'he said he had also asked for assistance in securing debt relief for poor countries and financial aid from international lenders.in a tweet on friday evening about his conversation with the honduran president , mr . trump said , “ we work closely together on the southern border . will be helping him with his request for ventilators and testing .'
from nltk import sent_tokenize, word_tokenize, PorterStemmer
from nltk.corpus import stopwords
def pos_tagging(sentences):
    """Run the spaCy pipeline over lower-cased *sentences* and return
    per-token POS tags and lemmas.

    Args:
        sentences: raw text to analyse (lower-cased before parsing).

    Returns:
        (pos_tags, lemmatize) where
        pos_tags is a list of (Token, tag_) pairs — fine-grained
        (Penn-Treebank-style) tags — and lemmatize is a list of
        (Token, lemma hash, lemma string) triples.

    NOTE(review): relies on the module-level `nlp` pipeline loaded
    earlier in the notebook.  (The original's no-op
    `sentences = sentences` self-assignment was removed.)
    """
    docs = nlp(sentences.lower())
    lemmatize = [(token, token.lemma, token.lemma_) for token in docs]
    pos_tags = [(token, token.tag_) for token in docs]
    return pos_tags, lemmatize
# Tag and lemmatize the sampled sentences, then show both result lists.
pos_tags, lemmatized = pos_tagging(sentences)
for result in (pos_tags, lemmatized):
    print(result)
[(he, 'PRP'), (said, 'VBD'), (he, 'PRP'), (had, 'VBD'), (also, 'RB'), (asked, 'VBN'), (for, 'IN'), (assistance, 'NN'), (in, 'IN'), (securing, 'VBG'), (debt, 'NN'), (relief, 'NN'), (for, 'IN'), (poor, 'JJ'), (countries, 'NNS'), (and, 'CC'), (financial, 'JJ'), (aid, 'NN'), (from, 'IN'), (international, 'JJ'), (lenders.in, 'NNS'), (a, 'DT'), (tweet, 'NN'), (on, 'IN'), (friday, 'NNP'), (evening, 'NN'), (about, 'IN'), (his, 'PRP$'), (conversation, 'NN'), (with, 'IN'), (the, 'DT'), (honduran, 'NNP'), (president, 'NNP'), (,, ','), (mr, 'NNP'), (., 'NNP'), (trump, 'NNP'), (said, 'VBD'), (,, ','), (“, '``'), (we, 'PRP'), (work, 'VBP'), (closely, 'RB'), (together, 'RB'), (on, 'IN'), (the, 'DT'), (southern, 'JJ'), (border, 'NN'), (., '.'), (will, 'MD'), (be, 'VB'), (helping, 'VBG'), (him, 'PRP'), (with, 'IN'), (his, 'PRP$'), (request, 'NN'), (for, 'IN'), (ventilators, 'NNS'), (and, 'CC'), (testing, 'NN'), (., '.')] [(he, 561228191312463089, '-PRON-'), (said, 8685289367999165211, 'say'), (he, 561228191312463089, '-PRON-'), (had, 14692702688101715474, 'have'), (also, 12084876542534825196, 'also'), (asked, 203487227105936704, 'ask'), (for, 16037325823156266367, 'for'), (assistance, 13758060267895693543, 'assistance'), (in, 3002984154512732771, 'in'), (securing, 16480985536374789266, 'secure'), (debt, 16235366863958461733, 'debt'), (relief, 12027419396575871032, 'relief'), (for, 16037325823156266367, 'for'), (poor, 13558078257114079636, 'poor'), (countries, 12290671265767728302, 'country'), (and, 2283656566040971221, 'and'), (financial, 12019518237760591348, 'financial'), (aid, 11586757942818230145, 'aid'), (from, 7831658034963690409, 'from'), (international, 6392893036165415004, 'international'), (lenders.in, 9460573360514710984, 'lenders.in'), (a, 11901859001352538922, 'a'), (tweet, 18233253705271341989, 'tweet'), (on, 5640369432778651323, 'on'), (friday, 11784865196037645502, 'friday'), (evening, 4823652925149831785, 'evening'), (about, 942632335873952620, 'about'), (his, 
561228191312463089, '-PRON-'), (conversation, 12275634602700366607, 'conversation'), (with, 12510949447758279278, 'with'), (the, 7425985699627899538, 'the'), (honduran, 2000225461946737733, 'honduran'), (president, 13696383780240996584, 'president'), (,, 2593208677638477497, ','), (mr, 2919978968283337964, 'mr'), (., 12646065887601541794, '.'), (trump, 15856857394754098415, 'trump'), (said, 8685289367999165211, 'say'), (,, 2593208677638477497, ','), (“, 15884554869126768810, '"'), (we, 561228191312463089, '-PRON-'), (work, 10038440415813069799, 'work'), (closely, 9696970313201087903, 'closely'), (together, 12060003407050460571, 'together'), (on, 5640369432778651323, 'on'), (the, 7425985699627899538, 'the'), (southern, 12121605977752639731, 'southern'), (border, 17575469328687188493, 'border'), (., 12646065887601541794, '.'), (will, 18307573501153647118, 'will'), (be, 10382539506755952630, 'be'), (helping, 17461235395181654430, 'help'), (him, 561228191312463089, '-PRON-'), (with, 12510949447758279278, 'with'), (his, 561228191312463089, '-PRON-'), (request, 440192745867607215, 'request'), (for, 16037325823156266367, 'for'), (ventilators, 6370701854268528591, 'ventilator'), (and, 2283656566040971221, 'and'), (testing, 2754665471364627735, 'testing'), (., 12646065887601541794, '.')]
# Run NER over the sampled sentences (original casing, so proper nouns
# still tag correctly) and record each entity's label and surface text.
docs = nlp(sentences)
sent_e_label = [entity.label_ for entity in docs.ents]
sent_e_text = [entity.text for entity in docs.ents]
for entity in docs.ents:
    print(entity.text, entity.start_char, entity.end_char, entity.label_)
print(sent_e_label, sent_e_text)
friday 143 149 DATE evening 150 157 TIME honduran 190 198 NORP trump 216 221 PERSON ['DATE', 'TIME', 'NORP', 'PERSON'] ['friday', 'evening', 'honduran', 'trump']
#import spacy
# Visualize entities and the dependency parse for both the sampled
# sentences and the full article text.
from spacy import displacy

def _show_views(parsed):
    # One entity highlight view plus one dependency-arc view per document.
    displacy.render(parsed, style='ent', jupyter=True)
    displacy.render(parsed, style='dep', jupyter=True, options={'distance': 100})

docs = nlp(sentences.lower())
_show_views(docs)
docs = nlp(mytext.lower())
_show_views(docs)
def replace_person_names(token):
    """Return the redacted replacement for *token*.

    Tokens that belong to a PERSON entity (ent_iob != 0 marks entity
    membership) become '[REDACTED] '; every other token is returned as
    its original text via token.string, which keeps trailing whitespace.
    """
    is_person_entity = token.ent_iob != 0 and token.ent_type_ == 'PERSON'
    if is_person_entity:
        return '[REDACTED] '
    return token.string
def redact_names(nlp_doc):
    """Collapse each entity span of *nlp_doc* into a single token, then
    rebuild the document text with PERSON tokens replaced by
    '[REDACTED] ' (via replace_person_names).

    NOTE(review): Span.merge mutates the doc in place — spaCy 2.x API,
    matching the pinned en_core_web_sm-2.2.0 model used above.
    """
    for entity in nlp_doc.ents:
        entity.merge()
    redacted_pieces = (replace_person_names(token) for token in nlp_doc)
    return ''.join(redacted_pieces)
# Redact PERSON entities from the full article, re-parse the cleaned
# text, and visualize the result.  (The first parse is consumed by
# redact_names; its binding was immediately overwritten in the original.)
redacted = redact_names(nlp(mytext.lower()))
docs = nlp(redacted.lower())
displacy.render(docs, style='ent', jupyter=True)
displacy.render(docs, style='dep', jupyter=True, options={'distance': 100})